{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from urllib import request"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 数据地址\n",
    "# 注意这种长字符串的写法，可以借鉴\n",
    "target_url = (\"https://archive.ics.uci.edu/ml/machine-learning-\"\n",
    "\"databases/undocumented/connectionist-bench/sonar/sonar.all-data\")\n",
    "target_url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "headers = {'User-Agent': \"Mozilla/5.0\"}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.使用requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "resp = requests.get(target_url, headers=headers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_list = []\n",
    "labels = []\n",
    "\n",
    "for line in resp.text.split('\\n'):\n",
    "    # split on comma\n",
    "    if not line.strip():\n",
    "        continue\n",
    "    row = line.strip().split(\",\")\n",
    "    x_list.append(row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of Rows of Data:  208\n",
      "Number of Columns of Data:  61\n"
     ]
    }
   ],
   "source": [
    "print(\"Number of Rows of Data: \", len(x_list))\n",
    "print('Number of Columns of Data: ', len(x_list[0]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.使用urlopen"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 简单的伪装请求"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "req = request.Request(target_url, headers=headers)\n",
    "with request.urlopen(req) as resp:\n",
    "    data = resp.read()\n",
    "\n",
    "# data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "req = request.Request(target_url)\n",
    "req.add_header('User-Agent', 'Mozilla/5.0')  # 注意这里传的不是字典\n",
    "with request.urlopen(req) as resp:\n",
    "    data = resp.read()\n",
    "\n",
    "# data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 使用简单的opener"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "opener = request.build_opener()\n",
    "opener.addheaders = [('User-Agent', 'Mozilla/5.0')]  # 注意这里传的不是字典"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "with opener.open(target_url) as resp:\n",
    "    data = resp.read()\n",
    "\n",
    "# data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 顺便使用下cookie"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "cookie介绍:  \n",
    "* HTTP是一个无状态协议，但是在访问网页时，有时我们需要保存会话信息，比如登录信息。  \n",
    "* 有两种比较常用的保存会话信息的方式：Cookie及Session  \n",
    "* 使用Cookie会将所有会话信息保存在客户端  \n",
    "* 使用Session会将会话信息保存在服务器端，但是服务器端会给客户端发SessionID等信息，这些信息一般存在客户端的Cookie中  \n",
    "* 有了Cookie，当我们登录成功后，爬取该网站的其他网页时，则会保存登录状态进行内容的爬取  \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import http.cookiejar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "req = request.Request(target_url)\n",
    "req.add_header('User-Agent', 'Mozilla/5.0')\n",
    "cjar = http.cookiejar.CookieJar()  # 创建cookiejar对象\n",
    "opener = request.build_opener(request.HTTPCookieProcessor(cjar), request.HTTPSHandler)  #使用HTTPCookieProcessor创建Cookie处理器(handler)，并以其为参数构建opener对象\n",
    "request.install_opener(opener)  # 将opener安装为全局\n",
    "\n",
    "with opener.open(req) as resp:\n",
    "    data = resp.read()\n",
    "\n",
    "# data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "如果需要代理ip，可以创建代理ip处理器：proxy_handler = request.ProxyHandler({'http':'211.167.248.228:8080'})  # 代理服务器的地址  \n",
    "然后在构建opener的时候，传进去。如：opener = urllib.request.build_opener(proxy_handler, urllib.request.HTTPHandler)  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.综合使用文件和网络获取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "def load_data(filepath: str=None, url: str=None):\n",
    "    # 如果文件存在，从文件导入数据\n",
    "    if filepath and os.path.isfiles(filepath):\n",
    "        with open(filepath) as f:\n",
    "            data = f.read()\n",
    "    elif url:\n",
    "        headers = {\"User-Agent\": \"Mozilla/5.0\"}\n",
    "        resp = requests.get(target_url, headers=headers)\n",
    "        data = resp.text\n",
    "    else:\n",
    "        raise ValueError(f\"参数：filepath和url必须设置一个\")\n",
    "    \n",
    "    return data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.使用封装大dataapi获取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataapi import DataApi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>V0</th>\n",
       "      <th>V1</th>\n",
       "      <th>V2</th>\n",
       "      <th>V3</th>\n",
       "      <th>V4</th>\n",
       "      <th>V5</th>\n",
       "      <th>V6</th>\n",
       "      <th>V7</th>\n",
       "      <th>V8</th>\n",
       "      <th>V9</th>\n",
       "      <th>...</th>\n",
       "      <th>V51</th>\n",
       "      <th>V52</th>\n",
       "      <th>V53</th>\n",
       "      <th>V54</th>\n",
       "      <th>V55</th>\n",
       "      <th>V56</th>\n",
       "      <th>V57</th>\n",
       "      <th>V58</th>\n",
       "      <th>V59</th>\n",
       "      <th>V60</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0200</td>\n",
       "      <td>0.0371</td>\n",
       "      <td>0.0428</td>\n",
       "      <td>0.0207</td>\n",
       "      <td>0.0954</td>\n",
       "      <td>0.0986</td>\n",
       "      <td>0.1539</td>\n",
       "      <td>0.1601</td>\n",
       "      <td>0.3109</td>\n",
       "      <td>0.2111</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0027</td>\n",
       "      <td>0.0065</td>\n",
       "      <td>0.0159</td>\n",
       "      <td>0.0072</td>\n",
       "      <td>0.0167</td>\n",
       "      <td>0.0180</td>\n",
       "      <td>0.0084</td>\n",
       "      <td>0.0090</td>\n",
       "      <td>0.0032</td>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0453</td>\n",
       "      <td>0.0523</td>\n",
       "      <td>0.0843</td>\n",
       "      <td>0.0689</td>\n",
       "      <td>0.1183</td>\n",
       "      <td>0.2583</td>\n",
       "      <td>0.2156</td>\n",
       "      <td>0.3481</td>\n",
       "      <td>0.3337</td>\n",
       "      <td>0.2872</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0084</td>\n",
       "      <td>0.0089</td>\n",
       "      <td>0.0048</td>\n",
       "      <td>0.0094</td>\n",
       "      <td>0.0191</td>\n",
       "      <td>0.0140</td>\n",
       "      <td>0.0049</td>\n",
       "      <td>0.0052</td>\n",
       "      <td>0.0044</td>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0262</td>\n",
       "      <td>0.0582</td>\n",
       "      <td>0.1099</td>\n",
       "      <td>0.1083</td>\n",
       "      <td>0.0974</td>\n",
       "      <td>0.2280</td>\n",
       "      <td>0.2431</td>\n",
       "      <td>0.3771</td>\n",
       "      <td>0.5598</td>\n",
       "      <td>0.6194</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0232</td>\n",
       "      <td>0.0166</td>\n",
       "      <td>0.0095</td>\n",
       "      <td>0.0180</td>\n",
       "      <td>0.0244</td>\n",
       "      <td>0.0316</td>\n",
       "      <td>0.0164</td>\n",
       "      <td>0.0095</td>\n",
       "      <td>0.0078</td>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0100</td>\n",
       "      <td>0.0171</td>\n",
       "      <td>0.0623</td>\n",
       "      <td>0.0205</td>\n",
       "      <td>0.0205</td>\n",
       "      <td>0.0368</td>\n",
       "      <td>0.1098</td>\n",
       "      <td>0.1276</td>\n",
       "      <td>0.0598</td>\n",
       "      <td>0.1264</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0121</td>\n",
       "      <td>0.0036</td>\n",
       "      <td>0.0150</td>\n",
       "      <td>0.0085</td>\n",
       "      <td>0.0073</td>\n",
       "      <td>0.0050</td>\n",
       "      <td>0.0044</td>\n",
       "      <td>0.0040</td>\n",
       "      <td>0.0117</td>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0762</td>\n",
       "      <td>0.0666</td>\n",
       "      <td>0.0481</td>\n",
       "      <td>0.0394</td>\n",
       "      <td>0.0590</td>\n",
       "      <td>0.0649</td>\n",
       "      <td>0.1209</td>\n",
       "      <td>0.2467</td>\n",
       "      <td>0.3564</td>\n",
       "      <td>0.4459</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0031</td>\n",
       "      <td>0.0054</td>\n",
       "      <td>0.0105</td>\n",
       "      <td>0.0110</td>\n",
       "      <td>0.0015</td>\n",
       "      <td>0.0072</td>\n",
       "      <td>0.0048</td>\n",
       "      <td>0.0107</td>\n",
       "      <td>0.0094</td>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 61 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       V0      V1      V2      V3      V4      V5      V6      V7      V8  \\\n",
       "0  0.0200  0.0371  0.0428  0.0207  0.0954  0.0986  0.1539  0.1601  0.3109   \n",
       "1  0.0453  0.0523  0.0843  0.0689  0.1183  0.2583  0.2156  0.3481  0.3337   \n",
       "2  0.0262  0.0582  0.1099  0.1083  0.0974  0.2280  0.2431  0.3771  0.5598   \n",
       "3  0.0100  0.0171  0.0623  0.0205  0.0205  0.0368  0.1098  0.1276  0.0598   \n",
       "4  0.0762  0.0666  0.0481  0.0394  0.0590  0.0649  0.1209  0.2467  0.3564   \n",
       "\n",
       "       V9  ...     V51     V52     V53     V54     V55     V56     V57  \\\n",
       "0  0.2111  ...  0.0027  0.0065  0.0159  0.0072  0.0167  0.0180  0.0084   \n",
       "1  0.2872  ...  0.0084  0.0089  0.0048  0.0094  0.0191  0.0140  0.0049   \n",
       "2  0.6194  ...  0.0232  0.0166  0.0095  0.0180  0.0244  0.0316  0.0164   \n",
       "3  0.1264  ...  0.0121  0.0036  0.0150  0.0085  0.0073  0.0050  0.0044   \n",
       "4  0.4459  ...  0.0031  0.0054  0.0105  0.0110  0.0015  0.0072  0.0048   \n",
       "\n",
       "      V58     V59  V60  \n",
       "0  0.0090  0.0032    R  \n",
       "1  0.0052  0.0044    R  \n",
       "2  0.0095  0.0078    R  \n",
       "3  0.0040  0.0117    R  \n",
       "4  0.0107  0.0094    R  \n",
       "\n",
       "[5 rows x 61 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_sonar = DataApi.get_sonar_data()\n",
    "df_sonar.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
